COVID-19 Data Analysis

TOP 25 COUNTRIES BY TOTAL CONFIRMED CASES

Data repository: link

Jupyter Notebook repository: link

In [1]:
import io
import json
import requests
import datetime as dt

import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo

from matplotlib.dates import date2num, num2date

from scipy.optimize import curve_fit
from IPython.display import Markdown, display, Math
from scipy import stats as sps
from scipy.interpolate import interp1d
import pandas as pd

pyo.init_notebook_mode()
In [2]:
# Fetch the pre-aggregated per-country time series. The file starts with a
# UTF-8 BOM, hence the explicit 'utf-8-sig' decode before parsing.
json_countries = "https://raw.githubusercontent.com/maxdevblock/covid-19-time-series/master/json/COVID-COUNTRIES.json"
with requests.get(json_countries) as response:
    payload = response.content.decode('utf-8-sig')
    data = json.loads(payload)
In [3]:
# Credible-mass levels for the highest-density intervals.
# NOTE(review): `define_p` is not referenced anywhere else in this file
# (HDIs() carries its own default of [.95, .5]) — possibly dead; confirm.
define_p = [.95, .5]

# We create an array for every possible value of Rt
# (grid of 1201 points from 0 to 12, step 0.01).
R_T_MAX = 12
r_t_range = np.linspace(0, R_T_MAX, R_T_MAX*100+1)

# Gamma is 1/serial interval
# https://wwwnc.cdc.gov/eid/article/26/7/20-0282_article
# https://www.nejm.org/doi/full/10.1056/NEJMoa2001316
GAMMA = 1/7.5


def get_df(_data, country):
    """Return a country's cumulative confirmed cases as a date-indexed Series.

    Parameters
    ----------
    _data : dict
        Mapping country -> {"Confirmed": {ISO-date string: cumulative count}}.
    country : str
        Key into ``_data``.

    Returns
    -------
    pd.Series
        Named 'cases', with a DatetimeIndex named 'data', sorted by date.

    The original implementation built a CSV string in a loop (quadratic string
    concatenation) and round-tripped it through ``read_csv(..., squeeze=True)``;
    ``squeeze`` is deprecated and removed in pandas 2.0. Building the Series
    directly is simpler, faster, and version-proof.
    """
    series = pd.Series(_data[country]["Confirmed"], name="cases")
    series.index = pd.to_datetime(series.index)
    series.index.name = "data"
    return series.sort_index()


def prepare_cases(cases, cutoff=1, std=2):

    new_cases = cases.diff()
    new_cases[new_cases < 0] = 0

    smoothed = new_cases.rolling(7,
        win_type='gaussian',
        min_periods=1,
        center=True).mean(std=std).round()
    smoothed[smoothed < 0] = 0
    
    idx_start = np.searchsorted(smoothed, cutoff)
    
    smoothed = smoothed.iloc[idx_start:]
    original = new_cases.loc[smoothed.index]
    
    return original, smoothed

def highest_density_interval(pmf, p=.9):
    """Find the narrowest index interval of ``pmf`` holding probability > p.

    Accepts a Series (returns a Series with 'Low_XX'/'High_XX' entries) or a
    DataFrame (applied column-wise, returning a DataFrame). If no interval
    exceeds ``p``, the threshold is relaxed in steps of 0.05 until one does.
    """
    # DataFrame: recurse over columns, one row of results per column.
    if isinstance(pmf, pd.DataFrame):
        per_column = [highest_density_interval(pmf[col], p=p) for col in pmf]
        return pd.DataFrame(per_column, index=pmf.columns)

    cumsum = np.cumsum(pmf.values)

    # total_p[i, j] = probability mass between index i (low) and j (high).
    total_p = cumsum - cumsum[:, None]

    # Collect all (low, high) pairs whose mass exceeds the threshold,
    # loosening the threshold until at least one pair qualifies.
    slack = 0.0
    lows, highs = (total_p > p).nonzero()
    while lows.size == 0 or highs.size == 0:
        slack += .05
        lows, highs = (total_p > (p - slack)).nonzero()

    # The narrowest qualifying interval is the highest-density one.
    best = (highs - lows).argmin()
    low = pmf.index[lows[best]]
    high = pmf.index[highs[best]]

    labels = [f'Low_{p*100:.0f}', f'High_{p*100:.0f}']
    return pd.Series([low, high], index=labels)


def HDI_of_grid(probMassVec, credMass=0.95):
    """Highest-density region of a gridded probability mass vector.

    Returns a dict with the grid positions in the HDI ('indexes'), the total
    mass they carry ('mass', >= credMass), and the density cutoff ('height').
    """
    # Walk the masses from largest to smallest until credMass is covered;
    # the last mass admitted defines the density cutoff.
    descending = np.sort(probMassVec, axis=None)[::-1]
    cutoff_idx = np.min(np.where(np.cumsum(descending) >= credMass))
    height = descending[cutoff_idx]

    in_hdi = probMassVec >= height
    return {'indexes': np.where(in_hdi)[0],
            'mass': np.sum(probMassVec[in_hdi]),
            'height': height}


def HDI_of_grid_from_df(pmf, p):
    """Wrap HDI_of_grid, mapping its index range back onto ``pmf``'s index.

    Series input yields a Series with 'Low_XX'/'High_XX' labels; DataFrame
    input is handled column-wise and yields a DataFrame (one row per column).
    """
    if isinstance(pmf, pd.DataFrame):
        per_column = [HDI_of_grid_from_df(pmf[col], p=p) for col in pmf]
        return pd.DataFrame(per_column, index=pmf.columns)

    hdi = HDI_of_grid(pmf, p)
    # First and last grid positions of the HDI, translated to index labels.
    lo = pmf.index[hdi["indexes"][0]]
    hi = pmf.index[hdi["indexes"][-1]]

    labels = [f'Low_{p*100:.0f}', f'High_{p*100:.0f}']
    return pd.Series([lo, hi], index=labels)


def HDIs(pmf, P=(.95, .5)):
    """Compute highest-density intervals for each credible mass in ``P``.

    Parameters
    ----------
    pmf : pd.Series or pd.DataFrame
        Probability mass(es), passed through to HDI_of_grid_from_df.
    P : sequence of float, default (.95, .5)
        Credible-mass levels. The default is a tuple rather than the
        original list, avoiding the shared-mutable-default pitfall.

    Returns
    -------
    list
        One HDI_of_grid_from_df result per entry of ``P``, in order.
    """
    return [HDI_of_grid_from_df(pmf, p=p) for p in P]

def get_posteriors(sr, sigma=0.15):
    """Bayesian daily posteriors over Rt from a smoothed new-cases series.

    Parameters
    ----------
    sr : pd.Series
        Smoothed daily new cases, date-indexed (output of prepare_cases).
    sigma : float, default 0.15
        Std-dev of the Gaussian random walk linking consecutive days' Rt.

    Returns
    -------
    (posteriors, log_likelihood)
        posteriors : DataFrame indexed by r_t_range, one column per date.
        log_likelihood : float, running sum of log P(k_t) over the series.

    Relies on module-level globals GAMMA and r_t_range.
    NOTE(review): if a day's likelihood mass is entirely zero (e.g. a long
    run of zero cases), `denominator` is 0, `np.log` emits a divide-by-zero
    warning and NaNs propagate through all later posteriors — this is the
    failure mode visible in the notebook output for some countries.
    """

    # (1) Calculate Lambda: expected cases per candidate Rt, from the
    # previous day's count and the exponential-growth relation.
    lam = sr[:-1].values * np.exp(GAMMA * (r_t_range[:, None] - 1))

    
    # (2) Calculate each day's likelihood: Poisson P(k_t | Rt) for every
    # grid value of Rt (rows) and every date after the first (columns).
    likelihoods = pd.DataFrame(
        data = sps.poisson.pmf(sr[1:].values, lam),
        index = r_t_range,
        columns = sr.index[1:])
    
    # (3) Create the Gaussian Matrix: transition kernel allowing Rt to
    # drift between days with std-dev `sigma`.
    process_matrix = sps.norm(loc=r_t_range,
                              scale=sigma
                             ).pdf(r_t_range[:, None]) 

    # (3a) Normalize all rows to sum to 1
    process_matrix /= process_matrix.sum(axis=0)
    
    # (4) Calculate the initial prior: Gamma(a=4) over the Rt grid.
    prior0 = sps.gamma(a=4).pdf(r_t_range)
    prior0 /= prior0.sum()

    # Create a DataFrame that will hold our posteriors for each day
    # Insert our prior as the first posterior.
    posteriors = pd.DataFrame(
        index=r_t_range,
        columns=sr.index,
        data={sr.index[0]: prior0}
    )
    
    # We said we'd keep track of the sum of the log of the probability
    # of the data for maximum likelihood calculation.
    log_likelihood = 0.0

    # (5) Iteratively apply Bayes' rule
    for previous_day, current_day in zip(sr.index[:-1], sr.index[1:]):

        #(5a) Calculate the new prior: yesterday's posterior diffused
        # through the Gaussian transition kernel.
        current_prior = process_matrix @ posteriors[previous_day]
        
        #(5b) Calculate the numerator of Bayes' Rule: P(k|R_t)P(R_t)
        numerator = likelihoods[current_day] * current_prior
        
        #(5c) Calculate the denominator of Bayes' Rule P(k)
        denominator = np.sum(numerator)
        
        # Execute full Bayes' Rule (NaN-propagating if denominator == 0).
        posteriors[current_day] = numerator/denominator
        
        # Add to the running sum of log likelihoods
        log_likelihood += np.log(denominator)
    
    return posteriors, log_likelihood

def plotly_rt(result, name):
    """Render an interactive Plotly chart of the Rt estimate for one country.

    Parameters
    ----------
    result : pd.DataFrame
        Columns 'ML' (most-likely Rt), 'Low_95'/'High_95' and
        'Low_50'/'High_50' credible bounds; index level 'data' holds dates.
    name : str
        Country name used in the chart title.

    Displays the figure via plotly.offline.iplot; returns nothing.
    """
    index = result['ML'].index.get_level_values('data')
    values = result['ML'].values

    # Aesthetically, extrapolate credible interval by 1 day either side
    lofn = interp1d(date2num(index),
                     result['Low_95'].values,
                     bounds_error=False,
                     fill_value='extrapolate')

    hifn = interp1d(date2num(index),
                      result['High_95'].values,
                      bounds_error=False,
                      fill_value='extrapolate')
    
    # Aesthetically, extrapolate credible interval by 1 day either side
    lofn50 = interp1d(date2num(index),
                     result['Low_50'].values,
                     bounds_error=False,
                     fill_value='extrapolate')

    hifn50 = interp1d(date2num(index),
                      result['High_50'].values,
                      bounds_error=False,
                      fill_value='extrapolate')

    # `extended` pads the date range by one extra day at the end; the bands
    # are sampled on it below while x stays `index`.
    # NOTE(review): y arrays evaluated on `extended` have one more point
    # than x=index — presumably Plotly drops the extras; confirm intended.
    extended = pd.date_range(start=index[0], end=index[-1]+pd.Timedelta(days=1))
    hyperextended = pd.date_range(start=index[0]-pd.Timedelta(days=1), end=index[-1]+pd.Timedelta(days=2))
        
    fig = go.Figure()
    
    # Horizontal reference line at Rt = 1 (epidemic growth threshold).
    fig.add_trace(
        go.Scatter(
            x=[index[0]-pd.Timedelta(days=10), index[-1]+pd.Timedelta(days=10)], y=[1, 1],
            line = dict(color='black', width=1), showlegend=False,
        )
    )
    # 50% credible band (darker fill), drawn as low line then high line
    # filled back to the previous trace.
    fig.add_trace(
        go.Scatter(
            x=index, y=lofn50(date2num(extended)),
            line_color="rgba(0,0,0,.25)", showlegend=False,
            name="lo50 R<sub>t</sub>"
        )
    )
    fig.add_trace(
        go.Scatter(
            x=index, y=hifn50(date2num(extended)),
            line_color="rgba(0,0,0,.25)", showlegend=False,
            fill="tonexty", fillcolor="rgba(0,0,0,.1)",
            name="hi50 R<sub>t</sub>"
        )
    )
    # 95% credible band (lighter fill), same construction.
    fig.add_trace(
        go.Scatter(
            x=index, y=lofn(date2num(extended)),
            line_color="rgba(0,0,0,.1)", showlegend=False,
            name="lo95 R<sub>t</sub>"
        )
    )
    fig.add_trace(
        go.Scatter(
            x=index, y=hifn(date2num(extended)),
            line_color="rgba(0,0,0,.1)", showlegend=False,
            fill="tonexty", fillcolor="rgba(0,0,0,.1)",
            name="hi95 R<sub>t</sub>"
        )
    )
    
    # Most-likely Rt as markers colored green -> white -> red around Rt = 1.
    fig.add_trace(
        go.Scatter(
            x=index, y=values,
            marker=dict(
                size=7,
                line=dict(width=1, color="black"),
                color=values,
                cmin=0,
                cmax=max(values),
                colorbar=dict(
                    title="R<sub>t</sub>"
                ),
                colorscale=[
                    [0., "rgba(0,150,0,1)"],
                    # White pinned at Rt = 1 (colorscale positions are
                    # fractions of cmax).
                    [1./max(values), "rgba(255, 255, 255, 1)"],
                    [1., "rgba(255,0,0,1)"],
                ]
            ),
            mode="markers+lines", showlegend=False,
            line = dict(color='grey', width=2, dash='dot'),
            name="R<sub>t</sub>",
        )
    )
    fig.update_layout(legend_orientation="h",
        showlegend=True, plot_bgcolor='rgba(0,0,0,0)', 
        yaxis={"gridcolor": '#bdbdbd', "zerolinecolor": '#969696', "range":[0, 5]},
        xaxis={"gridcolor": '#bdbdbd', "range":[hyperextended[0], hyperextended[-1]]},
        title={"text": "Real time {} R<sub>t</sub>".format(name), "xanchor": "center", "x": 0.5},
        yaxis_title="R<sub>t</sub>", hovermode="x unified"
    )
    pyo.iplot(fig)
In [4]:
# Report the date coverage of the dataset, using the US series as reference.
date_keys = list(data["US"]["Confirmed"].keys())
date_format = "%Y-%m-%d"

print("FIRST ENTRY DATE: {}".format(date_keys[0]))
print("LAST  ENTRY DATE: {}".format(date_keys[-1]))

first_day = dt.datetime.strptime(date_keys[0], date_format)
last_day = dt.datetime.strptime(date_keys[-1], date_format)
period = (last_day - first_day).days

print("COVERAGE: {} days".format(period))
print("CURRENT DATE IS: {}".format(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
FIRST ENTRY DATE: 2020-01-22
LAST  ENTRY DATE: 2020-08-05
COVERAGE: 196 days
CURRENT DATE IS: 2020-08-06 20:43:18
In [5]:
def get_best_countries(json_data=None, number=25):
    """Return the ``number`` countries with the highest latest confirmed
    totals, sorted alphabetically.

    Parameters
    ----------
    json_data : dict
        Mapping country -> {"Confirmed": {ISO-date string: cumulative count}}.
        Despite the None default, a real mapping is required (None raises).
    number : int, default 25
        How many top countries to keep.

    Returns
    -------
    list of str
        Alphabetically sorted names of the top ``number`` countries.
    """
    last_confirmed = {}
    for country in json_data:
        confirmed = json_data[country]["Confirmed"]
        # ISO dates sort lexicographically, so max() gives the latest date
        # in O(n) (the original sorted(...)[-1] was O(n log n)).
        last_confirmed[country] = confirmed[max(confirmed)]

    # Rank by latest total, keep the top `number`, present alphabetically.
    top = sorted(last_confirmed, key=last_confirmed.get, reverse=True)[:number]
    return sorted(top)
In [6]:
# Pick the default top-25 countries by latest confirmed total.
best_countries = get_best_countries(json_data=data)
print(best_countries)
['Argentina', 'Bangladesh', 'Brazil', 'Canada', 'Chile', 'Colombia', 'France', 'Germany', 'India', 'Indonesia', 'Iran', 'Iraq', 'Italy', 'Mexico', 'Pakistan', 'Peru', 'Philippines', 'Qatar', 'Russia', 'Saudi Arabia', 'South Africa', 'Spain', 'Turkey', 'US', 'United Kingdom']
In [7]:
x = []   # shared datetime axis (one entry per day in the source data)
_x = []  # integer positions matching x
yC = {}  # per-country daily new confirmed cases (np.ndarray)
yD = {}  # per-country daily new deaths (np.ndarray)
yR = {}  # per-country daily new recovered (np.ndarray)
yP = {}  # per-country daily change in active infections (np.ndarray)

TOTyC = {}  # per-country cumulative confirmed cases (list)
TOTyD = {}  # per-country cumulative deaths (list)
TOTyR = {}  # per-country cumulative recovered (list)
TOTyP = {}  # per-country active infections = confirmed - deaths - recovered (list)
In [8]:
# Build the shared date axis from the first country's records (all series
# in the source JSON cover the same dates).
for day in data[best_countries[0]]["Confirmed"]:
    x.append(dt.datetime.strptime(day, "%Y-%m-%d"))
    _x.append(len(x) - 1)

for country in best_countries:
    confirmed = data[country]["Confirmed"]
    deaths = data[country]["Deaths"]
    recovered = data[country]["Recovered"]

    # Cumulative totals in date order (keyed by the Confirmed dates, as in
    # the original loop).
    TOTyC[country] = [confirmed[day] for day in confirmed]
    TOTyD[country] = [deaths[day] for day in confirmed]
    TOTyR[country] = [recovered[day] for day in confirmed]
    TOTyP[country] = [c - d - r for c, d, r in
                      zip(TOTyC[country], TOTyD[country], TOTyR[country])]

    # Daily increments as float arrays. np.diff is O(n); the previous
    # np.append-inside-a-loop version reallocated the array every day
    # (quadratic). .astype(float) keeps the original float64 dtype.
    yC[country] = np.diff(TOTyC[country]).astype(float)
    yD[country] = np.diff(TOTyD[country]).astype(float)
    yR[country] = np.diff(TOTyR[country]).astype(float)
    yP[country] = np.diff(TOTyP[country]).astype(float)
In [9]:
# Render a markdown table of contents linking to each country's section
# (anchors are the headers with spaces replaced by dashes).
lines = ["#### COUNTRIES:"]
for country in best_countries:
    anchor = country.replace(" ", "-")
    lines.append("- [{}](#{})".format(country, anchor))

display(Markdown("\n".join(lines) + "\n"))
In [10]:
# Google mobility data; the live CSV download below is kept for reference —
# presumably the pickle is a locally cached copy of it (confirm provenance).
#google = pd.read_csv(
#    "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv",
#    parse_dates=['date'], dtype={"sub_region_1": str, "sub_region_2": str},
#    index_col=["date"]
#)
# NOTE(review): pd.read_pickle executes arbitrary code while unpickling —
# only load pickle files you generated yourself.
google = pd.read_pickle("google-mobility.pkl")
In [11]:
# NOTE(review): `_R0` is never written to in this loop and the json.dump at
# the bottom is commented out — apparently leftover from an older revision.
_R0 = {}

# Per-country report: overview bars, daily-increase lines, Rt estimate,
# and Google mobility trends.
for country in best_countries:
    
    display(Markdown("## {}".format(country)))
    
    # --- Overview: stacked infected/recovered/deaths bars + total line ---
    fig = go.Figure()

    fig.add_trace(
        go.Bar(
            name="Infected", x=x, y=TOTyP[country], marker_color="cyan"
        )
    )
    fig.add_trace(
        go.Bar(
            name="Recovered", x=x, y=TOTyR[country], marker_color="lime"
        )
    )
    fig.add_trace(
        go.Bar(
            name="Deaths", x=x, y=TOTyD[country], marker_color="red"
        )
    )
    fig.add_trace(
        go.Scatter(
            name="Total", x=x, y=TOTyC[country], marker_color="blue", line_shape='spline',
        )
    )

    fig.update_layout(
        showlegend=True, plot_bgcolor='rgba(0,0,0,0)', 
        xaxis={"gridcolor": '#bdbdbd'},
        title={"text": "{} overview".format(country), "xanchor": "center", "x": 0.5},
        hovermode="x unified", barmode='stack'
    )
    fig.update_yaxes(title_text="number", gridcolor='#bdbdbd')
    pyo.iplot(fig)
    
    # --- Daily increases (x[1:] because diffs have one fewer point) ---
    fig = go.Figure()
    # confirmed
    fig.add_trace(go.Scatter(
        x=x[1:], y=yC[country], 
        mode='lines+markers',
        marker_color="blue", marker_size=5, marker_symbol="circle", marker_line_width=1,
        line_shape='spline',
        legendgroup="cases", name="new cases/day"
    ))
    # deaths
    fig.add_trace(go.Scatter(
        x=x[1:], y=yD[country], 
        mode='lines+markers',
        marker_color="red", marker_size=5, marker_symbol="diamond", marker_line_width=1,
        line_shape='spline',
        legendgroup="deaths", name="new deaths/day"
    ))
    # recovered
    fig.add_trace(
        go.Scatter(
        x=x[1:], y=yR[country], 
        mode='lines+markers',
        marker_color="lime", marker_size=5, marker_symbol="square", marker_line_width=1,
        line_shape='spline',
        legendgroup="recovered", name="new recovered/day"
    ))
    # infected
    fig.add_trace(go.Scatter(
        x=x[1:], y=yP[country], 
        mode='lines+markers',
        marker_color="lightskyblue", marker_size=5, marker_symbol="square",
        line_shape='spline',
        legendgroup="infected", name="new infected/day"
    ))
    fig.update_layout(legend_orientation="h",
        showlegend=True, plot_bgcolor='rgba(0,0,0,0)', 
        yaxis={"gridcolor": '#bdbdbd', "zerolinecolor": '#969696'},
        xaxis={"gridcolor": '#bdbdbd'},
        title={"text": "{}, daily increase".format(country), "xanchor": "center", "x": 0.5},
        yaxis_title="new/day",
    )
    pyo.iplot(fig)
    
    # --- Rt estimation; the broad except keeps the loop going when the
    # posterior collapses to NaN for a country (see errors in the output) ---
    try:
        df = get_df(data, country)

        original, smoothed = prepare_cases(df, std=2.5)

        # Note that we're fixing sigma to a value just for the example
        posteriors, log_likelihood = get_posteriors(smoothed, sigma=.20)

        # Note that this takes a while to execute - it's not the most efficient algorithm
        HDIS = HDIs(posteriors)
        #hdis = highest_density_interval(posteriors, p=.9)
        most_likely = posteriors.idxmax().rename('ML')
        result = most_likely
        for hdis in HDIS:
            result = pd.concat([result, hdis], axis=1)


        # Look into why you shift -1
        #result = pd.concat([most_likely, hdis], axis=1)

        plotly_rt(result, country)
    except Exception as err:
        print(f"ERROR with Ro: {err}")
    
    # Google's dataset uses "United States" where the JHU data uses "US".
    if country == "US":
        country = "United States"
        
    # Country-level mobility rows only (sub_region_1 is NaN at country level).
    _df = google.loc[google["country_region"] == country]
    _df = _df.loc[_df.fillna("NONE")["sub_region_1"] == "NONE"]
    
    # --- Mobility trends: one trace per metric column ---
    fig = go.Figure()
    for column in _df.columns[4:]:
        fig.add_trace(
            go.Scatter(
                x=_df.index,
                y=_df[column],
                name=column.replace("_", " ").title().split(" Percent")[0]
            )
        )
    fig.update_layout(
        legend_orientation="h",
        showlegend=True, plot_bgcolor='rgba(0,0,0,0)', 
        yaxis={"gridcolor": '#bdbdbd', "zerolinecolor": '#969696'},
        xaxis={"gridcolor": '#bdbdbd'},
        title={"text": f"{country} mobility (last update {_df.index[-1].date() if _df.index.size else 'None'})", "xanchor": "center", "x": 0.5},
        yaxis_title="Percentage Change from Baseline", hovermode="x unified"
    )
    pyo.iplot(fig)
    
    # Release the per-country frames before the next iteration.
    df = None
    _df = None
    
    display(Markdown("---"))

# with open("R0_countries.json", "w") as f:
#    json.dump(_R0, f)

Argentina


Bangladesh


Brazil


Canada


Chile


Colombia


France


Germany


India


Indonesia


Iran


Iraq


Italy


Mexico

/Users/massimopierini/opt/anaconda3/envs/covid/lib/python3.7/site-packages/ipykernel_launcher.py:162: RuntimeWarning:

divide by zero encountered in log

/Users/massimopierini/opt/anaconda3/envs/covid/lib/python3.7/site-packages/ipykernel_launcher.py:78: RuntimeWarning:

invalid value encountered in greater_equal

ERROR with Ro: zero-size array to reduction operation minimum which has no identity

Pakistan

ERROR with Ro: zero-size array to reduction operation minimum which has no identity

Peru


Philippines


Qatar


Russia


Saudi Arabia


South Africa


Spain


Turkey


US


United Kingdom

ERROR with Ro: zero-size array to reduction operation minimum which has no identity

In [ ]:
 

© 2020 Max Pierini & Sandra Mazzoli & Alessio Pamovio

Exported from countries/single.ipynb committed by Max Pierini on Mon Aug 31 10:59:12 2020 revision 1, 8eb7ef8